Practical Session 7: Forecasting

rm(list=ls())

# Install pacman if not installed already, and activate it
if (!require("pacman")) install.packages("pacman")

# Install, update and activate libraries
pacman::p_load(
  here, 
  rio, 
  skimr,
  tsibble,
  TSA,
  tidyverse,
  ciTools
)

# Create tsa_theme from previous exercise to be used in ggplot.
tsa_theme <- theme_bw() + 
        theme(
            plot.title = element_text(face = "bold", 
                                      size = 12),
            legend.background = element_rect(fill = "white", 
                                             size = 4, 
                                             colour = "white"),
            # legend.justification = c(0, 1),
            legend.position = "bottom",
            panel.grid.minor = element_blank()
        )
                                             


# Prepare ts data
#load(here("data", "mortagg2_case_6.Rdata"))
mortagg <- import(here("data", "mortagg2.csv"))

mortz <-
    mortagg %>%
    mutate(year_week = make_yearweek(year = year, week = week),
           index = seq.int(from = 1, to = nrow(mortagg)),
           sin52 = sin(2 * pi * week / 52), 
           cos52 = cos(2 * pi * week / 52),
           sin26 = sin(2 * pi * week / 26), 
           cos26 = cos(2 * pi * week / 26)) %>%
    as_tsibble(index = index)

# Model with trend and seasonality
mort_sine2cos2trendmodel <- glm(cases ~ index + sin52 + cos52 + sin26 + cos26,
                           family = "poisson",
                           data = mortz)

Expected learning outcomes

By the end of this session, participants should be able to: - understand the use of forecasting in public health surveillance data - forecast the expected number of cases of a disease into the future

Task 7.1

January 2020: Your boss received a phone call from the Ministry of Health. She is part of a committee that is responsible for setting the alert levels for mortality in Spain for the next year (2020). Before doing so, she gives you the task to forecast the total expected number of deaths for 2020 based on the historical data up to and including 2019.

Task 7.2 (Optional)

January 2021: A committee member is interested to get a better understanding of when there were periods of unusually high excess mortality and asks you to provide an analysis that highlights the time when these occurred.

Task 7.3 (Optional)

January 2021: your boss needs to inform the Ministry of Health about excess deaths so far during the first year of the pandemic.

Help for Task 7.1

ggplot(data = mortz) +
    geom_line(
        mapping = aes(x = year_week, y = cases),
        colour = "black",
        alpha = 1.2
    ) +
    geom_line(
        mapping = aes(x = year_week, y = mort_sine2cos2trendmodel$fitted),
        colour = "red",
        alpha = 1.2
    ) +
    scale_x_yearweek(date_labels = "%Y-%W", date_breaks = "8 weeks") +
    labs(x = "Year Week", y = "Weekly cases") +
    tsa_theme + 
    theme(axis.text.x = element_text(angle = 30, hjust = 1)) 

The dataset has records up to 2019-W52, and you would like to project the data (forecast) into 2020.

First, create a new time series object for 2020. Note that 2020 has 53 weeks.

pred.df <-
    tibble(
        year=2020,
        week  = 1:53,
        index = 521:(521+53-1), # add 53 weeks
        year_week = make_yearweek(year = year, week = week),
        sin52 = sin(2 * pi * week / 52),
        cos52 = cos(2 * pi * week / 52),
        sin26 = sin(2 * pi * week / 26),
        cos26 = cos(2 * pi * week / 26),
        pop = 47318050) %>%                # Spanish pop in 2020 (1st Jan)
    as_tsibble(index = index)

view(pred.df)

Calculate the expected values and 95% prediction intervals for each week of 2020 assuming that the Poisson regression model with trend and 2 seasonality terms based on the previous years is appropriate.

Predict and plot the expected number of deaths per week for 2020 with prediction intervals:

# apply mort_sine2cos2trendmodel to new data and predict cases with C.I. using bootstrapping.
# bootstrapping is a statistical method where you draw random samples from your data, 
# and analyze each of this samples.
# https://en.wikipedia.org/wiki/Bootstrapping_(statistics)


set.seed(12589)
pred.mort <- ciTools::add_ci(pred.df,
                            mort_sine2cos2trendmodel,
                            names=c("lPI", "uPI"),
                            yhatName="pred_cases",
                            response=TRUE,
                            type="boot",
                            nSims=1000)
head(pred.mort, 5)
  year week index year_week     sin52     cos52     sin26     cos26      pop
1 2020    1   521  2020 W01 0.1205367 0.9927089 0.2393157 0.9709418 47318050
2 2020    2   522  2020 W02 0.2393157 0.9709418 0.4647232 0.8854560 47318050
3 2020    3   523  2020 W03 0.3546049 0.9350162 0.6631227 0.7485107 47318050
4 2020    4   524  2020 W04 0.4647232 0.8854560 0.8229839 0.5680647 47318050
5 2020    5   525  2020 W05 0.5680647 0.8229839 0.9350162 0.3546049 47318050
  pred_cases      lPI       uPI
1   9668.633 9505.031  9847.676
2   9816.630 9649.887 10004.593
3   9917.123 9743.460 10116.956
4   9965.433 9788.115 10167.868
5   9959.873 9784.202 10162.163
ggplot(data = pred.mort) +
    geom_line(
        mapping = aes(x = year_week, y = pred_cases),
        colour = "red",
        alpha = 0.7
    ) +
    geom_ribbon(
        mapping = aes(x = year_week, ymin = lPI, ymax = uPI),
        fill = "red",
        alpha = 0.1
    ) +
    scale_y_continuous(limits = c(0, NA)) +
    scale_x_yearweek(date_labels = "%Y-%W", date_breaks = "4 weeks") +
    labs(x = "Year Week", y = "Predicted weekly fatalities") +
    tsa_theme + 
    theme(axis.text.x = element_text(angle = 30, hjust = 1)) 

Calculate the total number of expected deaths in 2020 in Spain.

# Total predicted cases in 2020, with lPI and uPI
pred.mort %>% 
  summarise(
    pred_cases_2020 = sum(pred_cases),
    pred_cases_2020_lPI = sum(lPI),
    pred_cases_2020_uPI = sum(uPI)
  )
  pred_cases_2020 pred_cases_2020_lPI pred_cases_2020_uPI
1        441385.8            435448.6            447514.4

Help for Task 7.2

CUSUM (cumulative sum) is a graphical method that can be used to determine when there is a change in a process (all-cause mortality in this example). In TSA it can also be used to decide whether there is a need to revise the model e.g. include a covariate or there have been changes in the seasonality.

Using expected values from the previous regression model, you can calculate the cumulative sum of the differences between the weekly observed and expected numbers of deaths:

mortz <- mortz %>%
  mutate(fit_cases = mort_sine2cos2trendmodel$fit,  # get predicted cases
         
         # calculate differences
         difference = cases - fit_cases,
         cumsum_excess = cumsum(difference),
         diff_zero = cases - mean(fit_cases),
         cumsum_zero = cumsum(diff_zero))

Plot this cumulative sum of the residuals.

ggplot(data = mortz) +
geom_line(
        mapping = aes(x = year_week, y = diff_zero),
        colour = "green",
        alpha = 0.7,
        lwd = 2
    ) +
    geom_line(
        mapping = aes(x = year_week, y = cumsum_zero),
        colour = "orange",
        alpha = 0.7,
        lwd = 2
    ) +
    scale_x_yearweek(date_labels = "%Y-%W", date_breaks = "1 year") +
    labs(x = "Year", y = "Cumulative excess cases") +
    tsa_theme

Help for Task 7.3

Plot the actual number of deaths and compare with the predictions from the model based on 2010-2019.

# load mortality data for 2020
mort2020 <- import(here("data", "mortagg2020.csv"))

# order mort2020 from week 1 to week 53, same as in pred.mort
mort2020 <- mort2020 %>% arrange(week)

# add actual deaths from 2020 in pred.mort
pred.mort <- pred.mort %>% 
  mutate(cases = mort2020$cases)

ggplot(data = pred.mort) +
    geom_line(
        mapping = aes(x = year_week, y = pred_cases),
        colour = "red",
        alpha = 0.7,
        lwd = 1.2
    ) +
    geom_ribbon(
        mapping = aes(x = year_week, ymin = lPI, ymax = uPI),
        fill = "red",
        alpha = 0.1
    ) +
    geom_line(
        mapping = aes(x = year_week, y = cases),
        colour = "black",
        alpha = 0.7,
        lwd = 1.2
    ) +
    scale_x_yearweek(date_labels = "%Y-%W", date_breaks = "4 weeks") +
    labs(x = "Year Week", y = "Predicted weekly cases") +
    tsa_theme + 
    theme(axis.text.x = element_text(angle = 30, hjust = 1)) 

Excess deaths can be calculated as the difference of the actual number of fatalities per week and the predicted mean, or the predicted upper 95% PI.

pred.mort <- pred.mort %>%
  mutate(
    difference_mean = cases - pred_cases,
    cusum_excess_mean = cumsum(difference_mean),
    difference_uPI = cases - uPI,
    cusum_excess_uPI = cumsum(difference_uPI)
  )


ggplot(data = pred.mort) +
    geom_line(
        mapping = aes(x = year_week, y = cusum_excess_mean),
        colour = "orange",
        alpha = 0.7,
        lwd = 2
    ) +
    geom_line(
        mapping = aes(x = year_week, y = cusum_excess_uPI),
        colour = "blue",
        alpha = 0.7,
        lwd = 2
    ) +   
    #scale_y_continuous(limits = c(-100, NA)) +
    scale_x_yearweek(date_labels = "%Y-%W", date_breaks = "4 weeks") +
    labs(x = "Year", y = "Cumulative excess cases") +
    tsa_theme

# save(list = ls(pattern = 'mort'), file = here("data", "mortagg2_case_7.RData"))
#load(here("data","mortagg2_case_7.RData"))